library(ngsReports)
library(edgeR)
library(magrittr)
library(scales)
library(tidyverse)
library(ggrepel)
library(pander)
theme_set(theme_bw())
rawFqc <- list.files("../0_rawData/FastQC/", pattern = "zip$", full.names = TRUE) %>%
getFastqcData()
trimFqc <- list.files("../1_trimmedData/FastQC/", pattern = "zip$", full.names = TRUE) %>%
getFastqcData()
alnFqc <- list.files("../2_alignedData/FastQC/", pattern = "zip$", full.names = TRUE) %>%
getFastqcData()
rt <- list(
readTotals(rawFqc) %>% mutate(Stage = "Raw"),
readTotals(trimFqc) %>% mutate(Stage = "Trimmed"),
readTotals(alnFqc) %>% mutate(Stage = "Aligned")
) %>%
bind_rows() %>%
mutate(Sample = str_remove_all(Filename, "(_R1.fq.gz|Aligned.sortedByCoord.out.bam)"),
Sample = factor(Sample, levels = str_sort(unique(Sample), numeric = TRUE)),
Stage = factor(Stage, levels = c("Raw", "Trimmed", "Aligned")))
Summary of library sizes after each step. The slight increase after alignments indicates a low level of multiple alignments
No unusual behaviour was found in GC content.
GC content for raw data
GC content for aligned data
No issues were evident with regard to duplication levels
Duplication levels for raw data
Duplication levels for aligned data
Alignment rates were also very good
alnLogs <- list.files("../2_alignedData/logs/", full.names = TRUE, pattern = "final.out") %>%
importStarLogs()
Alignment summaries for all libraries (before merging replicates)
Gene-level counts were loaded without filtering and an MDS plot generated in order to check whether replicate libraries grouped correctly.
counts <- read_tsv("../2_alignedData/featureCounts/genes.out") %>%
set_names(basename(colnames(.))) %>%
set_names(str_remove_all(colnames(.), "Aligned.sortedByCoord.out.bam"))
dge <- counts %>%
as.data.frame() %>%
column_to_rownames("Geneid") %>%
DGEList() %>%
calcNormFactors()
dge$samples %<>%
rownames_to_column("library") %>%
mutate(Genotype = str_extract_all(library, "(FAD|FS|__)"),
Genotype = str_replace_all(Genotype, "__", "WT"),
Genotype = factor(Genotype, levels = c("WT", "FS", "FAD")),
Sample = str_replace(library, "__-", "_WT"),
Sample = ifelse(str_count(Sample, "_") == 3, Sample, paste0(Sample, "_1")),
Replicate = str_extract(Sample, "[12]$"),
Sample = str_remove(Sample, "_[12]$"),
group = as.integer(Genotype)) %>%
column_to_rownames("library")
mds <- plotMDS(dge, plot = FALSE) %>%
extract2("cmdscale.out") %>%
set_colnames(paste0("Dim", 1:2)) %>%
as.data.frame() %>%
rownames_to_column("Library") %>%
as_tibble()
Plot of replicate libraries showing good concordance between replicates. These will be combined and low-expressed genes removed for the DE analysis. However, minimal separation between groups was noted in this pre-filtered stage.
Library sizes after summarising to the gene-level were also checked.
counts <- read_tsv("../2_alignedData/featureCounts/genes.out") %>%
set_names(basename(colnames(.))) %>%
set_names(str_remove_all(colnames(.), "Aligned.sortedByCoord.out.bam"))
Total counts from each library
| Library | Total Alignments | Counts | Percent |
|---|---|---|---|
| 13__-_3 | 17,605,122 | 11,393,459 | 64.72% |
| 2_FAD_4 | 23,157,372 | 16,034,737 | 69.24% |